# Standard analysis stack: pandas/numpy for data wrangling,
# seaborn/matplotlib/plotly for plotting, sklearn for modelling.
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib
# Load the insurance dataset (columns: age, sex, bmi, children, smoker,
# region, charges — see the preview below).
# NOTE(review): hard-coded local Windows path; parameterize for portability.
data = pd.read_csv("E:/insurance.csv")
# Preview the first five rows.
data.head(5)
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
# Age summary: 1338 records, ages 18-64, mean ~39.
data.age.describe()
count 1338.000000 mean 39.207025 std 14.049960 min 18.000000 25% 27.000000 50% 39.000000 75% 51.000000 max 64.000000 Name: age, dtype: float64
# NOTE(review): redundant re-import — pandas is already imported at the top.
import pandas as pd
# Class balance of the smoker column: 1064 non-smokers vs 274 smokers.
data.smoker.value_counts()
# Smoker counts split by sex.
px.histogram(data, x='smoker', color='sex', title='Smoker')
# Age distribution with a marginal box plot; 47 bins gives one bin per
# year of age (range 18-64 per the describe() output above).
fig = px.histogram(data,
x='age',
marginal='box',
nbins=47,
title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()
# BMI distribution with a marginal box plot.
fig = px.histogram(data,
x='bmi',
marginal='box',
color_discrete_sequence=['blue'],
title='Distribution of BMI (Body Mass Index)')
fig.update_layout(bargap=0.1)
fig.show()
# Distribution of annual charges, colored by smoker status.
fig = px.histogram(data,
x ='charges',
marginal='box',
color='smoker',
color_discrete_sequence=['green','blue'],
title='Annual Medical Charges')
fig.update_layout(bargap=0.1)
fig.show()
# Age vs. charges, colored by smoker status; sex shown on hover.
fig = px.scatter(data,
x='age',
y='charges',
color='smoker',
hover_data=['sex'],
title='Age vs. Charges')
fig.update_traces(marker_size=5)  # small markers keep dense regions readable
fig.show()
# BMI vs. charges, same styling as the age plot for easy comparison.
fig = px.scatter(data,
x='bmi',
y='charges',
color='smoker',
hover_data=['sex'],
title='BMI vs. Charges')
fig.update_traces(marker_size=5)
fig.show()
data.charges.corr(data.age)
0.2990081933306474
data.charges.corr(data.bmi)
0.19834096883362903
data.children.corr(data.charges)
0.0679982268479047
# Pairwise correlations of the numeric columns (age, bmi, children, charges).
# numeric_only=True selects them explicitly and silences the pandas
# FutureWarning emitted by the bare data.corr() call.
data.corr(numeric_only=True)
C:\Users\Rajalaxmi Mohapatra\AppData\Local\Temp\ipykernel_10460\2627137660.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
| age | bmi | children | charges | |
|---|---|---|---|---|
| age | 1.000000 | 0.109272 | 0.042469 | 0.299008 |
| bmi | 0.109272 | 1.000000 | 0.012759 | 0.198341 |
| children | 0.042469 | 0.012759 | 1.000000 | 0.067998 |
| charges | 0.299008 | 0.198341 | 0.067998 | 1.000000 |
# Annotated heatmap of the numeric-column correlations.
# numeric_only=True silences the pandas FutureWarning seen in the output.
sns.heatmap(data.corr(numeric_only=True), cmap='Blues', annot=True)
# BUG FIX: the original wrote `plt.title = ("Co-reletation_matrix")`, which
# ASSIGNS a tuple to plt.title — clobbering matplotlib's title() function
# for the rest of the session instead of titling the plot. Call it instead.
plt.title("Correlation Matrix")
C:\Users\Rajalaxmi Mohapatra\AppData\Local\Temp\ipykernel_10460\3440050043.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
def estimate_charges(age, w, b):
    """Estimate annual charges with a straight line: w * age + b.

    Works element-wise when `age` is a Series/array (via broadcasting).
    """
    slope_term = age * w
    return slope_term + b
# estimate_charges (defined above) assumes a linear relationship between
# the x axis ("age") and the y axis ("charges"):
#     charges = w * age + b
# Hand-pick a slope and intercept to try the helper out.
w = 50
b = 100
# Sanity check: 50 * 30 + 100 = 1600.
estimate_charges(30,w,b)
1600
# Restrict to non-smokers and apply the hand-picked line to every age.
non_smoker_df = data[data.smoker == 'no']
ages = non_smoker_df.age
# Element-wise estimate for all 1064 non-smoker records.
estimated_charges = estimate_charges(ages, w, b)
estimated_charges
1 1000
2 1500
3 1750
4 1700
5 1650
...
1332 2700
1333 2600
1334 1000
1335 1000
1336 1150
Name: age, Length: 1064, dtype: int64
non_smoker_df.charges
1 1725.55230
2 4449.46200
3 21984.47061
4 3866.85520
5 3756.62160
...
1332 11411.68500
1333 10600.54830
1334 2205.98080
1335 1629.83350
1336 2007.94500
Name: charges, Length: 1064, dtype: float64
# Plot the hand-tuned line over age.
# NOTE(review): both calls draw the ESTIMATES — the actual charges are never
# plotted here; likely intended plt.scatter(ages, non_smoker_df.charges)
# so the line can be compared against the real data. Confirm intent.
plt.scatter(ages, estimated_charges);
plt.plot(ages,estimated_charges)
plt.xlabel('Age');
plt.ylabel('Estimated Charges');
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
# Fit a closed-form linear regression of charges on age for non-smokers.
model = LinearRegression()
inputs = non_smoker_df[['age']]   # 2-D (n, 1) frame, as sklearn expects
targets = non_smoker_df.charges   # 1-D series of true charges
print('inputs.shape :', inputs.shape)
# BUG FIX: label typo in the original ('targes.shape').
print('targets.shape :', targets.shape)
inputs.shape : (1064, 1) targes.shape : (1064,)
model.fit(inputs, targets)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
predictions = model.predict(inputs)
predictions
array([2719.0598744 , 5391.54900271, 6727.79356686, ..., 2719.0598744 ,
2719.0598744 , 3520.80661289])
import math
from sklearn.metrics import mean_squared_error
def rmse(targets, predictions):
    """Root-mean-squared error between `targets` and `predictions`."""
    squared_error = (targets - predictions) ** 2
    return np.mean(squared_error) ** 0.5
rmse(targets, predictions)
4662.505766636395
model.coef_
array([267.24891283])
model.intercept_
-2091.4205565650827
# Fit a stochastic-gradient-descent regressor on the same data for comparison.
# NOTE(review): SGDRegressor is scale-sensitive and the inputs are unscaled
# here — the higher RMSE below may partly reflect that; verify with scaling.
model1 = SGDRegressor()
model1.fit(inputs,targets)
SGDRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SGDRegressor()
# Evaluate SGD against the closed-form LinearRegression fit above
# (4662.5 for LinearRegression vs ~5075 here).
predictions = model1.predict(inputs)
rmse(targets, predictions)
5075.193593045272
# Model creation for the smoker subset (274 records).
smoker_df = data[data.smoker == 'yes']
smoker_df
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 11 | 62 | female | 26.290 | 0 | yes | southeast | 27808.72510 |
| 14 | 27 | male | 42.130 | 0 | yes | southeast | 39611.75770 |
| 19 | 30 | male | 35.300 | 0 | yes | southwest | 36837.46700 |
| 23 | 34 | female | 31.920 | 1 | yes | northeast | 37701.87680 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1313 | 19 | female | 34.700 | 2 | yes | southwest | 36397.57600 |
| 1314 | 30 | female | 23.655 | 3 | yes | northwest | 18765.87545 |
| 1321 | 62 | male | 26.695 | 0 | yes | northeast | 28101.33305 |
| 1323 | 42 | female | 40.370 | 2 | yes | southeast | 43896.37630 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
274 rows × 7 columns
# Rebuild inputs/targets from the smoker subset only.
inputs = smoker_df[['age']]
targets = smoker_df.charges
# FIX: the original printed an empty label (print('', ...)); label the shapes.
print('inputs.shape :', inputs.shape)
print('targets.shape :', targets.shape)
(274, 1) (274,)
model.fit(inputs,targets)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
predicted = model.predict(inputs)
# FIX: the original did print('RMSE:',) — which prints the label with no
# value — and left rmse(...) as a bare expression (only displayed in a
# notebook, silently discarded in a script). Print label and value together.
print('RMSE:', rmse(targets, predicted))
RMSE:
10711.00334810241
# Charges differ sharply by smoker status, so encode it numerically.
sns.barplot(data = data, x='smoker', y='charges');
# Binary encoding: no -> 0, yes -> 1.
smoker_codes = {'no': 0, 'yes': 1}
data['smoker_code'] = data.smoker.map(smoker_codes)
# smoker_code is by far the strongest single correlate of charges (~0.787).
data.charges.corr(data.smoker_code)
0.7872514304984761
data
| age | sex | bmi | children | smoker | region | charges | smoker_code | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 |
1338 rows × 8 columns
# Create inputs and targets: four numeric features including the smoker code.
inputs, targets = data[['age', 'bmi', 'children', 'smoker_code']], data['charges']
# Create and train the model on the full dataset
model = LinearRegression().fit(inputs, targets)
# Generate predictions
predictions = model.predict(inputs)
# Compute loss to evaluate the model
# NOTE(review): this is TRAINING loss (same data it was fit on); a held-out
# split is introduced later for a fair estimate.
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.439217188081
sns.barplot(data=data, x='sex', y='charges')
<Axes: xlabel='sex', ylabel='charges'>
# Binary-encode sex: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
data['sex_code'] = data.sex.map(sex_codes)
data
| age | sex | bmi | children | smoker | region | charges | smoker_code | sex_code | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 | 0 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 | 1 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 | 1 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 | 1 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 | 1 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 | 0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 | 0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 | 0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 | 0 |
1338 rows × 9 columns
data.charges.corr(data.sex_code)
0.057292062202025464
# Create inputs and targets, now adding sex_code (weak correlate, ~0.057).
inputs, targets = data[['age', 'bmi', 'children', 'smoker_code', 'sex_code']], data['charges']
# Create and train the model
model = LinearRegression().fit(inputs, targets)
# Generate predictions
predictions = model.predict(inputs)
# Compute loss to evaluate the model — barely improves on the
# four-feature version (6056.10 vs 6056.44), consistent with the
# near-zero correlation of sex_code with charges.
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.100708754546
sns.barplot(data = data, x='region', y='charges');
#Model Improvements
#Feature Scaling
#Recall that due to regulatory requirements, we also need to explain the rationale behind the predictions our model.
#charges=w1×age+w2×bmi+w3×children+w4×smoker+w5×sex+w6×region+b
#To compare the importance of each feature in the model, our first instinct might be to compare their weights.
model.coef_
array([ 257.73498767, 322.36421449, 474.41112061, 23823.39253065,
-128.63985357])
model.intercept_
-12052.461985664726
data
| age | sex | bmi | children | smoker | region | charges | smoker_code | sex_code | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 | 0 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 | 1 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 | 1 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 | 1 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 | 1 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 | 0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 | 0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 | 0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 | 0 |
1338 rows × 9 columns
# Standardize the continuous columns to zero mean / unit variance so that
# model weights become comparable across features.
from sklearn.preprocessing import StandardScaler
numeric_cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
# fit() only learns mean_ and var_ from the data; no transform yet.
scaler.fit(data[numeric_cols])
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
scaler.mean_
array([39.20702541, 30.66339686, 1.09491779])
scaler.var_
array([197.25385199, 37.16008997, 1.45212664])
data[numeric_cols]
| age | bmi | children | |
|---|---|---|---|
| 0 | 19 | 27.900 | 0 |
| 1 | 18 | 33.770 | 1 |
| 2 | 28 | 33.000 | 3 |
| 3 | 33 | 22.705 | 0 |
| 4 | 32 | 28.880 | 0 |
| ... | ... | ... | ... |
| 1333 | 50 | 30.970 | 3 |
| 1334 | 18 | 31.920 | 0 |
| 1335 | 18 | 36.850 | 0 |
| 1336 | 21 | 25.800 | 0 |
| 1337 | 61 | 29.070 | 0 |
1338 rows × 3 columns
# Apply the fitted scaler; result is a plain ndarray, one row per record.
# NOTE(review): scaled_inputs is never used afterwards — the train/test
# split below operates on the raw `inputs` frame. Either wire the scaled
# columns into `inputs` or drop this step.
scaled_inputs = scaler.transform(data[numeric_cols])
scaled_inputs
array([[-1.43876426, -0.45332 , -0.90861367],
[-1.50996545, 0.5096211 , -0.07876719],
[-0.79795355, 0.38330685, 1.58092576],
...,
[-1.50996545, 1.0148781 , -0.90861367],
[-1.29636188, -0.79781341, -0.90861367],
[ 1.55168573, -0.26138796, -0.90861367]])
# NOTE(review): redundant re-import — train_test_split was imported at the top.
from sklearn.model_selection import train_test_split
# Hold out 20% of the data; random_state pins the split for reproducibility.
# NOTE(review): splits the raw `inputs` (age, bmi, children, smoker_code,
# sex_code) — the scaled_inputs computed above are not used here.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(inputs, targets, test_size=0.2,random_state=0)
# Create and train the model on the training split only
model = LinearRegression().fit(inputs_train, targets_train)
# Generate predictions on the held-out split
predictions_test = model.predict(inputs_test)
# Compute loss on unseen data — the honest performance estimate
loss = rmse(targets_test, predictions_test)
print('Test Loss:', loss)
Test Loss: 5671.492452926755
# Generate predictions on the training split for comparison with test loss
predictions_train = model.predict(inputs_train)
# Compute loss on the data the model was fit to; comparing it with the
# test loss above indicates how much the model over/under-fits
loss = rmse(targets_train, predictions_train)
print('Training Loss:', loss)
Training Loss: 6150.508349257895